{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from os import listdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summary_file(file_name,name):\n",
    "    df = pd.read_csv(file_name,sep=',',skiprows=1,header=None)\n",
    "    df.columns = [\"kmer\",\"Met\",\"Unmet\",\"Total\",\"Perc_Met\"]\n",
    "    kmer_temp = df.loc[:,[\"kmer\",\"Total\",\"Perc_Met\"]]\n",
    "    met_sum = df.loc[:,\"Met\"].sum()\n",
    "    tot_sum = df.loc[:,\"Total\"].sum()\n",
    "    per_met = met_sum*100/tot_sum\n",
    "    kmer_temp.loc[\"Sum\"] = [\"Total\", tot_sum, per_met]\n",
    "    kmer_temp[\"Perc_Nor\"] = kmer_temp[\"Perc_Met\"] - per_met\n",
    "    kmer_temp.columns = [\"kmer\",str(name)+\"_Total\",str(name)+\"_Perc_met\",str(name)+\"Perc_Norm\"]\n",
    "    return(kmer_temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CGI files (folder Claudia_CGI_output, file names starting with CGI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort them so the sample\n",
    "# (column) order of the exported tables is the same on every run and machine.\n",
    "files = sorted(listdir(\"Claudia_CGI_output\"))\n",
    "# Keep only the files whose name starts with \"CGI\".\n",
    "inputs = [file for file in files if file.startswith(\"CGI\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sample label per selected file: the file name with the two boilerplate\n",
    "# substrings removed. Order matches `inputs`.\n",
    "names = [\n",
    "    file.replace('CpG_context_','').replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv.csv','')\n",
    "    for file in inputs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input = 'Claudia_CGI_output'\n",
    "\n",
    "# Summarise every selected file under its sample label, then join the\n",
    "# per-sample tables on the shared 'kmer' column, one after another.\n",
    "sample_tables = [\n",
    "    summary_file(folder_input + \"/\" + file, label)\n",
    "    for file, label in zip(inputs, names)\n",
    "]\n",
    "output_file = sample_tables[0]\n",
    "for sample_table in sample_tables[1:]:\n",
    "    output_file = output_file.merge(sample_table, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flip the merged table so each row is one <sample> column of `output_file`.\n",
    "# After the flip the first row holds the kmer values: promote it to the\n",
    "# column header, then remove it from the data rows.\n",
    "flipped = output_file.T\n",
    "flipped.columns = flipped.iloc[0]\n",
    "output_file_t = flipped.drop(flipped.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows repeat as Total / Perc_met / Perc_Norm for each sample, so a stride of 3\n",
    "# starting at offset 0, 1 or 2 selects one metric across all samples.\n",
    "output_file_total = output_file_t.iloc[0::3, :]\n",
    "output_file_meth = output_file_t.iloc[1::3, :]\n",
    "output_file_norm = output_file_t.iloc[2::3, :]\n",
    "\n",
    "# Transpose back before writing: one row per kmer, one column per sample.\n",
    "output_file_total.T.to_csv(\"Motif_calls_TET3_84h_CGI_8June2020.csv\")\n",
    "output_file_meth.T.to_csv(\"Motif_meth_TET3_84h_CGI_8June2020.csv\")\n",
    "output_file_norm.T.to_csv(\"Motif_norm_TET3_84h_CGI_8June2020.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# non-CGI files (folder Claudia_CGI_output, file names starting with non_CGI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort them so the sample\n",
    "# (column) order of the exported tables is the same on every run and machine.\n",
    "files = sorted(listdir(\"Claudia_CGI_output\"))\n",
    "# Keep only the files whose name starts with \"non_CGI\".\n",
    "inputs = [file for file in files if file.startswith(\"non_CGI\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sample label per selected file: the file name with the two boilerplate\n",
    "# substrings removed. Order matches `inputs`.\n",
    "names = [\n",
    "    file.replace('CpG_context_','').replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv.csv','')\n",
    "    for file in inputs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input = 'Claudia_CGI_output'\n",
    "\n",
    "# Summarise every selected file under its sample label, then join the\n",
    "# per-sample tables on the shared 'kmer' column, one after another.\n",
    "sample_tables = [\n",
    "    summary_file(folder_input + \"/\" + file, label)\n",
    "    for file, label in zip(inputs, names)\n",
    "]\n",
    "output_file = sample_tables[0]\n",
    "for sample_table in sample_tables[1:]:\n",
    "    output_file = output_file.merge(sample_table, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flip the merged table so each row is one <sample> column of `output_file`.\n",
    "# After the flip the first row holds the kmer values: promote it to the\n",
    "# column header, then remove it from the data rows.\n",
    "flipped = output_file.T\n",
    "flipped.columns = flipped.iloc[0]\n",
    "output_file_t = flipped.drop(flipped.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows repeat as Total / Perc_met / Perc_Norm for each sample, so a stride of 3\n",
    "# starting at offset 0, 1 or 2 selects one metric across all samples.\n",
    "output_file_total = output_file_t.iloc[0::3, :]\n",
    "output_file_meth = output_file_t.iloc[1::3, :]\n",
    "output_file_norm = output_file_t.iloc[2::3, :]\n",
    "\n",
    "# Transpose back before writing: one row per kmer, one column per sample.\n",
    "output_file_total.T.to_csv(\"Motif_calls_TET3_84h_non_CGI_8June2020.csv\")\n",
    "output_file_meth.T.to_csv(\"Motif_meth_TET3_84h_non_CGI_8June2020.csv\")\n",
    "output_file_norm.T.to_csv(\"Motif_norm_TET3_84h_non_CGI_8June2020.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Rosie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort them so the sample\n",
    "# (column) order of the exported tables is the same on every run and machine.\n",
    "files = sorted(listdir(\"Rosie_CGI_output\"))\n",
    "# Keep only the files whose name starts with \"CGI\".\n",
    "inputs = [file for file in files if file.startswith(\"CGI\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sample label per selected file: the file name with the two boilerplate\n",
    "# substrings removed. Order matches `inputs`.\n",
    "names = [\n",
    "    file.replace('CpG_context_','').replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv.csv','')\n",
    "    for file in inputs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input = 'Rosie_CGI_output'\n",
    "\n",
    "# Summarise every selected file under its sample label, then join the\n",
    "# per-sample tables on the shared 'kmer' column, one after another.\n",
    "sample_tables = [\n",
    "    summary_file(folder_input + \"/\" + file, label)\n",
    "    for file, label in zip(inputs, names)\n",
    "]\n",
    "output_file = sample_tables[0]\n",
    "for sample_table in sample_tables[1:]:\n",
    "    output_file = output_file.merge(sample_table, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transpose the table merged in the previous cell so that each row is one\n",
    "# <sample> column of `output_file` and each column is a kmer. Without this\n",
    "# step the export would reuse an `output_file_t` left over from an earlier\n",
    "# section and write that section's data under the Decitabine file names.\n",
    "output_file_t = output_file.T\n",
    "output_file_t.columns = output_file_t.iloc[0]\n",
    "output_file_t = output_file_t.drop(output_file_t.index[0])\n",
    "\n",
    "# Rows repeat as Total / Perc_met / Perc_Norm for each sample, so a stride of 3\n",
    "# starting at offset 0, 1 or 2 selects one metric across all samples.\n",
    "output_file_total = output_file_t.iloc[0::3, :]\n",
    "output_file_meth = output_file_t.iloc[1::3, :]\n",
    "output_file_norm = output_file_t.iloc[2::3, :]\n",
    "\n",
    "# Transpose back before writing: one row per kmer, one column per sample.\n",
    "output_file_total.T.to_csv(\"Motif_calls_Decitabine_CGI_10June2020.csv\")\n",
    "output_file_meth.T.to_csv(\"Motif_meth_Decitabine_CGI_10June2020.csv\")\n",
    "output_file_norm.T.to_csv(\"Motif_norm_Decitabine_CGI_10June2020.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# non-CGI files (folder Rosie_CGI_output, file names starting with non_CGI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort them so the sample\n",
    "# (column) order of the exported tables is the same on every run and machine.\n",
    "files = sorted(listdir(\"Rosie_CGI_output\"))\n",
    "# Keep only the files whose name starts with \"non_CGI\".\n",
    "inputs = [file for file in files if file.startswith(\"non_CGI\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sample label per selected file: the file name with the two boilerplate\n",
    "# substrings removed. Order matches `inputs`.\n",
    "names = [\n",
    "    file.replace('CpG_context_','').replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv.csv','')\n",
    "    for file in inputs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input = 'Rosie_CGI_output'\n",
    "\n",
    "# Summarise every selected file under its sample label, then join the\n",
    "# per-sample tables on the shared 'kmer' column, one after another.\n",
    "sample_tables = [\n",
    "    summary_file(folder_input + \"/\" + file, label)\n",
    "    for file, label in zip(inputs, names)\n",
    "]\n",
    "output_file = sample_tables[0]\n",
    "for sample_table in sample_tables[1:]:\n",
    "    output_file = output_file.merge(sample_table, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transpose the table merged in the previous cell so that each row is one\n",
    "# <sample> column of `output_file` and each column is a kmer. Without this\n",
    "# step the export would reuse an `output_file_t` left over from an earlier\n",
    "# section and write that section's data under the Decitabine file names.\n",
    "output_file_t = output_file.T\n",
    "output_file_t.columns = output_file_t.iloc[0]\n",
    "output_file_t = output_file_t.drop(output_file_t.index[0])\n",
    "\n",
    "# Rows repeat as Total / Perc_met / Perc_Norm for each sample, so a stride of 3\n",
    "# starting at offset 0, 1 or 2 selects one metric across all samples.\n",
    "output_file_total = output_file_t.iloc[0::3, :]\n",
    "output_file_meth = output_file_t.iloc[1::3, :]\n",
    "output_file_norm = output_file_t.iloc[2::3, :]\n",
    "\n",
    "# Transpose back before writing: one row per kmer, one column per sample.\n",
    "output_file_total.T.to_csv(\"Motif_calls_Decitabine_non_CGI_10June2020.csv\")\n",
    "output_file_meth.T.to_csv(\"Motif_meth_Decitabine_non_CGI_10June2020.csv\")\n",
    "output_file_norm.T.to_csv(\"Motif_norm_Decitabine_non_CGI_10June2020.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort them so the sample\n",
    "# (column) order of the exported tables is the same on every run and machine.\n",
    "files = sorted(listdir(\"TET3_data\"))\n",
    "# Keep only the files whose name starts with \"CGI\".\n",
    "inputs = [file for file in files if file.startswith(\"CGI\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CGI_TET3-TKO_0.5A-ve_10055.csv',\n",
       " 'CGI_TET3-TKO_0.5Apos_10062.csv',\n",
       " 'CGI_TET3-TKO_0.5B-ve_10060.csv',\n",
       " 'CGI_TET3-TKO_0.5C-ve_10123.csv',\n",
       " 'CGI_TET3-TKO_0.5Cpos_10076.csv',\n",
       " 'CGI_TET3-TKO_0_A8497_14725.csv',\n",
       " 'CGI_TET3-TKO_0_B9239_10686.csv',\n",
       " 'CGI_TET3-TKO_0_C10031_10000.csv',\n",
       " 'CGI_TET3-TKO_1.5A-ve_10054.csv',\n",
       " 'CGI_TET3-TKO_1.5Apos_10092.csv',\n",
       " 'CGI_TET3-TKO_1.5B-ve_10147.csv',\n",
       " 'CGI_TET3-TKO_1.5Bpos_10053.csv',\n",
       " 'CGI_TET3-TKO_1.5C-ve_10056.csv',\n",
       " 'CGI_TET3-TKO_1.5Cpos_10003.csv',\n",
       " 'CGI_TET3-TKO_1A-ve_10090.csv',\n",
       " 'CGI_TET3-TKO_1Apos_10054.csv',\n",
       " 'CGI_TET3-TKO_1B-ve_10037.csv',\n",
       " 'CGI_TET3-TKO_1Bpos_10014.csv',\n",
       " 'CGI_TET3-TKO_1C-ve_10077.csv',\n",
       " 'CGI_TET3-TKO_1Cpos_10058.csv',\n",
       " 'CGI_TET3-TKO_2A-ve_10031.csv',\n",
       " 'CGI_TET3-TKO_2Apos_10042.csv',\n",
       " 'CGI_TET3-TKO_2Bpos_10048.csv',\n",
       " 'CGI_TET3-TKO_2C-ve_9874.csv',\n",
       " 'CGI_TET3-TKO_2Cpos_9970.csv',\n",
       " 'CGI_TET3-TKO_3Apos_10137.csv',\n",
       " 'CGI_TET3-TKO_3B-ve_8497.csv',\n",
       " 'CGI_TET3-TKO_3Bpos_9436.csv',\n",
       " 'CGI_TET3-TKO_3C-ve_9239.csv',\n",
       " 'CGI_TET3-TKO_3Cpos_10035.csv',\n",
       " 'CGI_TET3-TKO_NoAscB9874_11166.csv',\n",
       " 'CGI_TET3-TKO_NoAscC10054_10465.csv']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sample label per selected file: the file name with 'CpG_context_' and\n",
    "# '.csv' removed. Order matches `inputs`.\n",
    "names = [\n",
    "    file.replace('CpG_context_','').replace('.csv','')\n",
    "    for file in inputs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input = 'TET3_data'\n",
    "\n",
    "# Summarise every selected file under its sample label, then join the\n",
    "# per-sample tables on the shared 'kmer' column, one after another.\n",
    "sample_tables = [\n",
    "    summary_file(folder_input + \"/\" + file, label)\n",
    "    for file, label in zip(inputs, names)\n",
    "]\n",
    "output_file = sample_tables[0]\n",
    "for sample_table in sample_tables[1:]:\n",
    "    output_file = output_file.merge(sample_table, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flip the merged table so each row is one <sample> column of `output_file`.\n",
    "# After the flip the first row holds the kmer values: promote it to the\n",
    "# column header, then remove it from the data rows.\n",
    "flipped = output_file.T\n",
    "flipped.columns = flipped.iloc[0]\n",
    "output_file_t = flipped.drop(flipped.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows repeat as Total / Perc_met / Perc_Norm for each sample, so a stride of 3\n",
    "# starting at offset 0, 1 or 2 selects one metric across all samples.\n",
    "output_file_total = output_file_t.iloc[0::3, :]\n",
    "output_file_meth = output_file_t.iloc[1::3, :]\n",
    "output_file_norm = output_file_t.iloc[2::3, :]\n",
    "\n",
    "# Transpose back before writing: one row per kmer, one column per sample.\n",
    "output_file_total.T.to_csv(\"Motif_calls_TET3_old_CGI_8June2020.csv\")\n",
    "output_file_meth.T.to_csv(\"Motif_meth_TET3_old_CGI_8June2020.csv\")\n",
    "output_file_norm.T.to_csv(\"Motif_norm_TET3_old_CGI_8June2020.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# non-CGI files (folder TET3_data, file names starting with non_CGI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort them so the sample\n",
    "# (column) order of the exported tables is the same on every run and machine.\n",
    "files = sorted(listdir(\"TET3_data\"))\n",
    "# Keep only the files whose name starts with \"non_CGI\".\n",
    "inputs = [file for file in files if file.startswith(\"non_CGI\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One sample label per selected file, in the same order as `inputs`.\n",
    "# The long pipeline suffix is removed when present, followed by a plain '.csv'\n",
    "# extension: the CGI files listed from TET3_data end in just '.csv', so\n",
    "# removing only the long suffix would leave '.csv' inside every label and\n",
    "# therefore inside the exported column names.\n",
    "# NOTE(review): assumes the non_CGI files in TET3_data are named like the CGI\n",
    "# ones - confirm against the folder contents.\n",
    "names = [\n",
    "    file.replace('CpG_context_','')\n",
    "        .replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv.csv','')\n",
    "        .replace('.csv','')\n",
    "    for file in inputs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input = 'TET3_data'\n",
    "\n",
    "# Summarise every selected file under its sample label, then join the\n",
    "# per-sample tables on the shared 'kmer' column, one after another.\n",
    "sample_tables = [\n",
    "    summary_file(folder_input + \"/\" + file, label)\n",
    "    for file, label in zip(inputs, names)\n",
    "]\n",
    "output_file = sample_tables[0]\n",
    "for sample_table in sample_tables[1:]:\n",
    "    output_file = output_file.merge(sample_table, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flip the merged table so each row is one <sample> column of `output_file`.\n",
    "# After the flip the first row holds the kmer values: promote it to the\n",
    "# column header, then remove it from the data rows.\n",
    "flipped = output_file.T\n",
    "flipped.columns = flipped.iloc[0]\n",
    "output_file_t = flipped.drop(flipped.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows repeat as Total / Perc_met / Perc_Norm for each sample, so a stride of 3\n",
    "# starting at offset 0, 1 or 2 selects one metric across all samples.\n",
    "output_file_total = output_file_t.iloc[0::3, :]\n",
    "output_file_meth = output_file_t.iloc[1::3, :]\n",
    "output_file_norm = output_file_t.iloc[2::3, :]\n",
    "\n",
    "# Transpose back before writing: one row per kmer, one column per sample.\n",
    "output_file_total.T.to_csv(\"Motif_calls_TET3_old_non_CGI_8June2020.csv\")\n",
    "output_file_meth.T.to_csv(\"Motif_meth_TET3_old_non_CGI_8June2020.csv\")\n",
    "output_file_norm.T.to_csv(\"Motif_norm_TET3_old_non_CGI_8June2020.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
